# keep columns 1,2,5,6,7,9,10,16,17,18,19
p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]
rownames(p_e) <- p_e[[1]]
Without doing any reordering we cannot identify any clusters or outliers.
#p_e_sc %>%
plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
z = ~p_e_sc, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")
p_e_cdist <- dist(t(p_e_sc), method = "euclidean")
We used method “OLO” as the Hierarchical Clustering algorithm instead of “HC” method, because according to the documentation in seriation package and @[hahsler] the former does not optimize the Hamiltonian Path Length.
# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
order1 <- get_order(seriate(p_e_rdist, method = "OLO"))
order2 <-get_order(seriate(p_e_cdist, method = "OLO"))
p_reord <- p_e_sc[rev(order1), order2]
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
z = ~p_reord, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist - HC)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# computing distance as one minus correlation
p_e_cor <- as.dist((1 - cor(p_e_sc))/2)
p_e_cor1 <- as.dist((1 - cor(t(p_e_sc)))/2)
# set seed to ensure results are reproducible
set.seed(1212)
# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))
ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))
# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
z = ~p_reord2, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Cor dist)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
The ordering by euclidean distance produces a heat map that is easier to analyze. At first glance we can perceive four general regions of two groups. The first group heat map color tends towards a brighter shade of red while the second group tend towards a darker shade of red/black. Although these groups can be seen in the correlation distance heat map, it is not as clear as the first.
Based on the euclidean distance heat map, net wage tends to higher values from Dubai while the number of hours worked decrease. This is the opposite to cities like Delhi,Bankok and Seoul. Interestingly food costs are generally low in the cities with highee working hours. Caracas is an outlier because food costs are high while net wage and the number of hours worked remains low.
# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)
ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))
set.seed(111)
order_q4_2 <-get_order(seriate(p_e_cdist, method = "TSP"))
p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
z = ~p_reord_q4, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist- TSP)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
From visual comparison of the heatmap produced by HC solver and TSP, we prefer the latter because there appears to be a separation along the anti diagonal such that cities that have similary high prices are placed along this diagonal. In the heatmap by the TSP solver this distinctionn is not quite clear.
# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# or = order
# set seed
set.seed(222)
or1 <- seriate(p_e_rdist, method = "OLO")
set.seed(333)
or2 <- seriate(p_e_rdist, method = "TSP")
result1 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or1 ))
result2 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or2 ))
# criterion on HC solver and unordered
result1
## 2SUM AR_deviations AR_events BAR Cor_R
## unordered 1012004.5 107139.67 61656 29259.38 0.04268063
## ordered 756120.2 20296.15 26876 19055.56 0.20713342
## Gradient_raw Gradient_weighted Inertia Lazy_path_length
## unordered -4032 -11081.08 17886336 10126.431
## ordered 65528 156151.06 24666913 3713.804
## Least_squares LS ME Moore_stress Neumann_stress
## unordered 3575435 1006126.8 568.2673 986.9925 553.9889
## ordered 3352459 894638.7 652.4429 411.5392 239.0233
## Path_length RGAR
## unordered 281.7269 0.5169014
## ordered 121.9671 0.2253186
# criterion on TSP and unordered.
result2
## 2SUM AR_deviations AR_events BAR Cor_R
## unordered 1012004.5 107139.67 61656 29259.38 0.04268063
## ordered 838955.9 47893.87 40089 20181.60 0.11204948
## Gradient_raw Gradient_weighted Inertia Lazy_path_length
## unordered -4032 -11081.08 17886336 10126.431
## ordered 39102 89163.24 21209322 4540.624
## Least_squares LS ME Moore_stress Neumann_stress
## unordered 3575435 1006126.8 568.2673 986.9925 553.9889
## ordered 3441776 939297.2 653.2224 413.0466 237.8996
## Path_length RGAR
## unordered 281.7269 0.5169014
## ordered 119.0998 0.3360915
TSP solver has shorter path length (121.718) compared to HC solver (121.967) by a very small margin of .249. Thus it does a slighlty better job of optimizing the Hamiltonian Path Length. For measure of effectiveness (ME) the HC solver (ME = 652.44) has advantage over the TSP (ME = 651.598). A higher ME implies a better arrangement of the dissimilarity matrix. For gradient measures since objective is to increase the distance from main diagonal, we infer that HC (65528) does a better job of achieving this compared to TSP solver(41312).
# parallel coordinates plot from unsorted scaled data
p_e_sc2 <- as.data.frame(p_e_sc)
p_e_sc2 <- round(p_e_sc2, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
We can identify two clusters defined by Wage net (blue) and iphone 4s (red). Wage net has values greater than 0 in the red cluster (defined by iphone 4) while iphone has values has values greater than -0.5 in the blue cluster. These clusters are difficult to interpret. The most prominent outlier in the blue cluster is the price of Rice per kg, The lines at this variable are furthest apart . It does not seem to fit into any of the two clusters.
# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
p_ord_HC$name <- rownames(p_ord_HC)
# colapse the p_ord_HC to long format
p_ord_HC %>%
tidyr::gather(variable, value, -name, factor_key = T) %>%
arrange(name) %>%
ggplot(aes(x=variable, y=value, group=name)) +
geom_polygon(fill="#F81894") +
coord_polar() + theme_classic() +
facet_wrap(~ name) #+
#theme(axis.text.x = element_text(size = 5))
Cairo and Delhi are outliers with respect to the price of bread per kg. Prague, Johanessburg and Panama form a cluster the prices are comparatively small and similar across the products.. The most distinct outlier would be Caracas. The prices aååear higher due to the size of the radar chart.
Among Heatmaps, paralled coordinates and radar charts, heatmaps are relatively easier to interprate in terms of time and accuracy. Radar cahrts are hard to interprate when the number of variables to compare a re high. Parallel coordiantes are generally messy to work with.
The points in the scatter plot are very close to each other and so, it is difficult to interpret the results for both <=50K and >50K income levels. The trellis plot shows both income levels in separate panels and so, it is quite easy to analyse the datas. The people with age range of 0-25 who earns <=50K are working for long hours than people earning >50K.
plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
geom_point(size=1)
plot1
plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
facet_wrap(~income_level)
plot2
The density curve is skewed to the right (right skewed distribution) for income level <=50K and thus not normally distributed.The median income level for <=50K is less than the mean. Whereas the the density curve for income of >50K is bimodal. The trellis plot exhibits both income level for people with various marital status. The density curve for widower is normally distributed for both income levels. The people who were never married earnig <=50K lies within range of 17-40 age group.
plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3
plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4
In 3D-scatter plot, the overlapping of data points around the same values makes it difficult to analyse the relationship between given three varaibles. In raster type 2D-density plot, every panels of age group 29-90 exhibits almost same outputs for capital loss ranging 1000-2000 except age group of 17-29.
df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital loss')))
plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5
The patterns in every panels of trellis plot looks almost similar except the last panel i.e, age group of 48-90. The capital loss seems to be quite high for both age group of 37-48 and 48-90.
plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
facet_grid(cut_number(df$age,4)~.)
plot6a
Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-df[index,]
df4$Class<-as.factor(Class)
ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
facet_grid(Class~., labeller = "label_both")
library(ggplot2)
library(seriation)
library(scales)
library(tidyr)
library(dplyr)
library(lattice)
library(plotly)
prices_earnings <- read.delim("prices-and-earnings.txt")
# keep columns 1,2,5,6,7,9,10,16,17,18,19
p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]
rownames(p_e) <- p_e[[1]]
# question 2 scaling
p_e_sc <- scale(p_e[,-1])
#p_e_sc %>%
plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
z = ~p_e_sc, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")
p_e_cdist <- dist(t(p_e_sc), method = "euclidean")
# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
order1 <- get_order(seriate(p_e_rdist, method = "OLO"))
order2 <-get_order(seriate(p_e_cdist, method = "OLO"))
p_reord <- p_e_sc[rev(order1), order2]
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
z = ~p_reord, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist - HC)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# computing distance as one minus correlation
p_e_cor <- as.dist((1 - cor(p_e_sc))/2)
p_e_cor1 <- as.dist((1 - cor(t(p_e_sc)))/2)
# set seed to ensure results are reproducible
set.seed(1212)
# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))
ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))
# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
z = ~p_reord2, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Cor dist)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)
ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))
set.seed(111)
order_q4_2 <-get_order(seriate(p_e_cdist, method = "TSP"))
p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
z = ~p_reord_q4, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist- TSP)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# or = order
# set seed
set.seed(222)
or1 <- seriate(p_e_rdist, method = "OLO")
set.seed(333)
or2 <- seriate(p_e_rdist, method = "TSP")
result1 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or1 ))
result2 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or2 ))
# criterion on HC solver and unordered
result1
# criterion on TSP and unordered.
result2
# parallel coordinates plot from unsorted scaled data
p_e_sc2 <- as.data.frame(p_e_sc)
p_e_sc2 <- round(p_e_sc2, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
p_ord_HC$name <- rownames(p_ord_HC)
# colapse the p_ord_HC to long format
p_ord_HC %>%
tidyr::gather(variable, value, -name, factor_key = T) %>%
arrange(name) %>%
ggplot(aes(x=variable, y=value, group=name)) +
geom_polygon(fill="#F81894") +
coord_polar() + theme_classic() +
facet_wrap(~ name) #+
#theme(axis.text.x = element_text(size = 5))
df <- read.csv("adult.csv")
colnames(df) <- c("age","workclass","fnlwgt","edu","edu_num","marital_status","occupation","relationship","race","sex","cap_gain","cap_loss","hours_per_week","native_country","income_level")
#View(df)
plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
geom_point(size=1)
plot1
plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
facet_wrap(~income_level)
plot2
plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3
plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4
df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital loss')))
plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5
plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
facet_grid(cut_number(df$age,4)~.)
plot6a
Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-df[index,]
df4$Class<-as.factor(Class)
ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
facet_grid(Class~., labeller = "label_both")